#coding:utf-8
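'''A small toy script that prints keyword tags for a blog article.

Usage: (1) put the article address in the ``url`` variable below;
(2) it only works for articles on Jiaoda's blog.
'''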
import urllib2,re,jieba,jieba.analyse

# Address of the article to analyse; edit this to point at another post.
url = "http://www.jiaodaseo.com/topic/jishu82.html"

# Browser-like headers sent with the request.
# NOTE(review): the cookie is a hard-coded session value; presumably the site
# requires it to pass a verification step and it may expire -- confirm.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
    "Cookie":"yunsuo_session_verify=b35d26d9c5ebce86d1cf71a11bb31ac8"
    }
# Despite its name, ``response`` holds the outgoing Request object; the name
# is kept so that anything else referring to it keeps working.
response = urllib2.Request(url=url, headers=headers)
# Bound the wait so an unresponsive server cannot hang the script forever,
# and always release the connection once the page body has been read.
connection = urllib2.urlopen(response, timeout=30)
try:
    html = connection.read()
finally:
    connection.close()

def search(req, html):
    """Return the first match of ``req`` in ``html``.

    Thin wrapper around ``re.search``.

    Args:
        req: A regular-expression pattern string or a compiled pattern.
        html: The text to scan.

    Returns:
        The match object produced by ``re.search``, or ``None`` when the
        pattern does not occur in ``html``.
    """
    return re.search(req, html)

#提取正文内容正则表达式
area_content = re.compile('<div class="artCon">([\s\S]*?)<div class="pages">')	
content = search(area_content,html).group(1)

#去除正文html标签
dr = re.compile(r'<[^>]+>',re.S)
content = dr.sub('',content).replace('nbsp','').replace('hellip','')

#获取文章标题
title = search('<h1 class="heading">(.*?)</h1>',html).group(1)

#获取整个正文内容，赋予标题3倍权重
mainbody = title * 2 + content

tags = jieba.analyse.extract_tags(mainbody, topK=5)

print title ,u'标签:' +'\n'
print ','.join(tags).replace('seo,','')